In Class Exercise 6

Social Network

Huang Anni (Singapore Management University)
06-05-2022

1. Overview

In this take-home exercise, we reveal the patterns of life in Ohio, USA by creating data visualization with tmap.

With reference to point 2 in Challenge 1 of VAST Challenge 2022, the following questions will be addressed:

Consider the social activities in the community.

Set up

Before we start to draw graphs, there are some work to do:

Install and import packages

To draw social network plots, I use igraph.

devtools::install_github("itsleeds/od", build_vignettes = TRUE)
packages = c('ggiraph', 'plotly', 'tidyverse', 'DT','gganimate',
             'knitr', 'ggdist', 'scales', 'grid', 'gridExtra',
             'patchwork','ggsignif','gghighlight',"hrbrthemes",
             'readxl', 'gifski', 'gapminder','treemap', 'treemapify',
             'rPackedBar','ggridges','rmarkdown','crosstalk',
             'd3scatter','tidycensus','timetk','ggseas','lubridate',
             'ggrepel','doSNOW','data.table','ViSiElse','sf','tmap',
             'clock','dplyr','od','igraph', 'tidygraph', 'ggstatsplot',
             'ggraph', 'visNetwork', 'lubridate', 'clock',
             'tidyverse', 'graphlayouts','FunnelPlotR', 'plotly', 'knitr')

for(p in packages) {
  if(!require(p, character.only = T)) {
    install.packages(p)
  }
  library(p, character.only = T)
}

Read in raw data

The data sets used in this take home exercise is from the social network journals of participants in Ohio City.

There are two data sets. One contains the nodes data and the other contains the edges (also know as link) data.

participants <- read_csv("./raw_data/Attributes/Participants.csv")
social_network <- read_csv("./raw_data/Journals/SocialNetwork.csv")
financial <- read_csv("./raw_data/Journals/FinancialJournal.csv")

Data processing

For participants:

For social graph:

For friends num:

participants$educationLevel<-factor(participants$educationLevel,ordered=TRUE,levels=c('Low','HighSchoolOrCollege',"Bachelors","Graduate"))
brks <- c(17, 20, 30, 40, 50, 60, Inf)
grps <- c('<=20', '21-30','31-40', '41-50', '51-60', '>60')
participants$Age_Group <- cut(participants$age, breaks=brks, labels = grps, right = FALSE)
brks <- c(0, 0.3, 0.5, 0.6, 1)
grps <- c('Really Sad', 'Sad','Neutral', 'Happy')
participants$Joviality_Group <- cut(participants$joviality, breaks=brks, labels = grps, right = FALSE)
income_par <- financial %>% 
  filter(category %in% c('Wage')) %>%
  group_by(participantId,month=lubridate::month(timestamp)) %>%
  summarise(wage = round(sum(amount),1)) %>%
  ungroup()%>%
  group_by(participantId) %>%
  summarise(wage = mean(wage)) %>%
  ungroup()
participants <- participants %>%
  inner_join(income_par, by = "participantId")

socialNetwork_edges <- social_network %>%
  group_by(from=participantIdFrom, to=participantIdTo) %>%
  filter(from!=to) %>%
  summarise(weight = n()) %>%
  filter(weight > 1) %>%
  ungroup()

parId_in_socialNetwork <- union(unique(socialNetwork_edges$from),unique(socialNetwork_edges$to)) %>%
  sort()
parId_in_socialNetwork <- data.frame(participantId = parId_in_socialNetwork)
socialNetwork_nodes <- participants %>% 
  inner_join(parId_in_socialNetwork, by = "participantId")
socialNetwork_nodes$id <- socialNetwork_nodes$participantId
socialNetwork_graph <- igraph::graph_from_data_frame(socialNetwork_edges, 
                                                     vertices = socialNetwork_nodes)%>%
  as_tbl_graph()

friends_num_df <- socialNetwork_edges %>%
  group_by(from) %>%
  filter(from!=to) %>%
  group_by(participantId = from) %>%
  summarise(friends_num = n()) %>%
  ungroup() %>%
  inner_join(participants, by = "participantId")

interaction_num_df <- socialNetwork_edges %>%
  group_by(participantId = from) %>%
  filter(participantId!=to) %>%
  summarise(interaction_num = sum(weight)) %>%
  ungroup() %>%
  inner_join(participants, by = "participantId")

top5_most_active<-interaction_num_df %>%
  arrange(desc(interaction_num)) %>%
  slice(1:5)
top5_most_active$id <- top5_most_active$participantId
top5_most_active_nodes <- top5_most_active 
top5_most_active_edges <- social_network %>%
  group_by(from=participantIdFrom, to=participantIdTo) %>%
  filter((from!=to)&
         (from %in% top5_most_active$id)) %>%
  summarise(weight = n()) %>%
  filter(weight > 1) %>%
  ungroup()
parId_in_socialNetwork <- union(unique(top5_most_active_edges$from),unique(top5_most_active_edges$to)) %>%
  sort()
parId_in_socialNetwork <- data.frame(participantId = parId_in_socialNetwork)
top5_most_active_nodes <- participants %>% 
  inner_join(parId_in_socialNetwork, by = "participantId")
top5_most_active_nodes$id <- top5_most_active_nodes$participantId

top5_most_active_graph <- igraph::graph_from_data_frame(top5_most_active_edges, 
                                                     vertices = top5_most_active_nodes)%>%
  as_tbl_graph()

Save the processed data

write_rds(top5_most_active_graph, './data/top5_most_active_graph.rds')
write_rds(top5_most_active_nodes, './data/top5_most_active_nodes.rds')
write_rds(top5_most_active_edges, './data/top5_most_active_edges.rds')
write_rds(interaction_num_df,'./data/interaction_num.rds')
write_rds(socialNetwork_graph,'./data/socialNetwork_graph.rds')
write_rds(socialNetwork_nodes,'./data/socialNetwork_nodes.rds')
write_rds(socialNetwork_edges,'./data/socialNetwork_edges.rds')
write_rds(friends_num_df,'./data/friends_num.rds')

Analyze social network data

Read in processed data

top5_most_active_graph <- read_rds('./data/top5_most_active_graph.rds')
top5_most_active_nodes <- read_rds('./data/top5_most_active_nodes.rds')
top5_most_active_edges <- read_rds('./data/top5_most_active_edges.rds')
interaction_num_df <- read_rds('./data/interaction_num.rds')
social_network <- read_csv("./raw_data/Journals/SocialNetwork.csv")
socialNetwork_graph <- read_rds('./data/socialNetwork_graph.rds')
socialNetwork_nodes <- read_rds('./data/socialNetwork_nodes.rds')
socialNetwork_edges <- read_rds('./data/socialNetwork_edges.rds')
friends_num_df <- read_rds('./data/friends_num.rds')

Is there relationship with people’s education level and social interaction times?

Suppose the first thing we want to inspect is the distribution of the number of social interactions for participants of different education levels. We also want to know if the mean differences in the number of social interaction between different education levels is statistically significant.

I apply ANOVA test to see if there’s any relationship between social interaction tims and people’s education level. We can see that there’s a huge difference between the median social interaction times within different groups. As we can see the median social activeness is positively correlated with degree level. People with higher degree is more active.

Is there relationship with people’s wage and social zone size?

I plotted a scatter plot of the social zone size and people’s wage. Surprisingly, the rich people do not have a big social zone.

p2 <- ggplot(data=friends_num_df, aes(x = wage, 
                                      y = friends_num, 
                                      text =paste("Wage:", round(wage,2),
                                                  "\nNo. of Friends:",friends_num)))+
  geom_point(aes(size=friends_num,color=friends_num), alpha = 1/10) +
  labs(y= 'No. of Interacted People', x= 'Wage',
       title = "Fig2: Relatiobship between wage and social zone size",
       subtitle = "People who has high wage tend to keep a smalll social zone")

ggplotly(p2,tooltip = c("text"))

The top-5 most active people with different Joviality Status

As we can see, people who are happy has a strong connection with each other.

ggraph(top5_most_active_graph,
       layout = "fr") + # random
  geom_edge_link(aes(width=weight,alpha=0.2)) +
  geom_node_point(aes(color=Joviality_Group, 
                      size = 0.3)) +
  theme_void() +# remove gray background +
  facet_nodes(~Joviality_Group) 

top5_edges_aggregated <- top5_most_active_edges %>%
  left_join(top5_most_active_nodes, by =  c("from" = "id")) %>%
  rename(from_JP = Joviality_Group) %>%
  left_join(top5_most_active_nodes, by = c("to" = "id")) %>%
  rename(to_JP = Joviality_Group) %>%
  group_by(from, to) %>%
    summarise(weight = n()) %>%
  filter(from!=to) %>%
  filter(weight > 1) %>%
  ungroup()
top5_most_active_nodes <- top5_most_active_nodes %>%
  rename(group = Joviality_Group)
visNetwork(top5_most_active_nodes,
           top5_edges_aggregated) %>%
  visIgraphLayout(layout = "layout_with_fr") %>%
  visLegend() %>%
  visLayout(randomSeed = 123)